Importing
library
## Importing library
### List of required packages
required_packages <- c("tidyverse","janitor" ,"readr","dplyr","haven","sf", "flextable","sp", "factoextra", "FactoMineR","gtsummary", "sjPlot", "fastDummies","ggthemes","spdep","patchwork")
# Check if packages are installed
missing_packages <- setdiff(required_packages, installed.packages()[,"Package"])
### Install missing packages
if (length(missing_packages) > 0) {
install.packages(missing_packages)
}
### Load all packages
lapply(required_packages, library, character.only = TRUE)
# Read shapefile data for 2002 and 2002
MPI_data_dr_2002 <- sf::read_sf(paste0(here::here(),"/output/output_data/MPI_data_dr_2002.shp"))
MPI_data_dr_2013 <- sf::read_sf(paste0(here::here(),"/output/output_data/MPI_data_dr_2013.shp"))
Regression modeling
with Senegal Census data
#"nb_indv",
predictors = c("nbr_mng","nbr_dc_p","nbr_dc_m","nbr_dc_sc","nbr_dc_sp","mdn_cm_","mn_cm_g","nbr_cm_h","nbr_cm_f","pct_cm_")
outcome = "MPI_ndx"
Inspecting the
outcome variable (MPI) with visualization
mhv_map <- ggplot(MPI_data_dr_2013, aes(fill = MPI_ndx)) +
geom_sf(color = NA) +
scale_fill_viridis_c() +
theme_void() +
labs(fill = "MPI ")
mhv_histogram <- ggplot(MPI_data_dr_2013, aes(x = MPI_ndx)) +
geom_histogram(alpha = 0.5, fill = "navy", color = "navy",
bins = 100) +
theme_minimal() +
scale_x_continuous(labels = scales::label_number_si(accuracy = 0.1)) +
labs(x = "MPI")
mhv_map + mhv_histogram + labs(title = "MPI value charts for Senegal Census 2002")

mhv_map_log <- ggplot(MPI_data_dr_2013, aes(fill = log(MPI_ndx))) +
geom_sf(color = NA) +
scale_fill_viridis_c() +
theme_void() +
labs(fill = "MPI\nvalue (log)")
mhv_histogram_log <- ggplot(MPI_data_dr_2013, aes(x = log(MPI_ndx))) +
geom_histogram(alpha = 0.5, fill = "navy", color = "navy",
bins = 100) +
theme_minimal() +
scale_x_continuous() +
labs(x = "MPI (log)")
mhv_map_log + mhv_histogram_log + labs(title = "Logged MPI value charts for Senegal Census 2002")

A first regression
model
library(sf)
library(units)
predictors = c("nbr_mng","nbr_dc_p","nbr_dc_m","nbr_dc_sc","nbr_dc_sp","mn_cm_g","pct_cm_","nb_indv")
outcome = "MPI_ndx"
MPI_data_dr_2013_for_model<- MPI_data_dr_2013 %>%
dplyr::select(MPI_ndx,predictors) %>%
mutate(pop_density = as.numeric(set_units(nb_indv / st_area(.), "1/km2"))) %>%
dplyr::select(-nb_indv)
formula <- "log(MPI_ndx) ~ nbr_mng + nbr_dc_p + nbr_dc_m + nbr_dc_sc + nbr_dc_sp + mn_cm_g + pct_cm_ + pop_density "
model1 <- lm(formula = formula, data = MPI_data_dr_2013_for_model)
summary(model1)
##
## Call:
## lm(formula = formula, data = MPI_data_dr_2013_for_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.25739 -0.15703 0.01659 0.16114 1.04399
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.384e-01 2.339e-01 -3.584 0.000377 ***
## nbr_mng 7.733e-03 7.553e-04 10.238 < 2e-16 ***
## nbr_dc_p -9.840e-03 1.992e-03 -4.939 1.13e-06 ***
## nbr_dc_m -2.005e-02 2.772e-03 -7.232 2.22e-12 ***
## nbr_dc_sc -1.396e-02 2.812e-03 -4.963 1.01e-06 ***
## nbr_dc_sp -1.549e-02 1.587e-03 -9.757 < 2e-16 ***
## mn_cm_g -3.719e-02 4.190e-03 -8.877 < 2e-16 ***
## pct_cm_ 6.054e-01 1.565e-01 3.869 0.000126 ***
## pop_density 1.575e-06 8.175e-07 1.926 0.054765 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2692 on 426 degrees of freedom
## Multiple R-squared: 0.6061, Adjusted R-squared: 0.5987
## F-statistic: 81.93 on 8 and 426 DF, p-value: < 2.2e-16
library(corrr)
dfw_estimates <- MPI_data_dr_2013_for_model%>%
select(-MPI_ndx) %>%
st_drop_geometry()
correlations <- correlate(dfw_estimates, method = "pearson")
network_plot(correlations) + labs(title = "Network plot of correlations between model predictors for Senegal Census 2002")

library(car)
vif(model1)
## nbr_mng nbr_dc_p nbr_dc_m nbr_dc_sc nbr_dc_sp mn_cm_g
## 3.555007 2.381862 2.434817 2.639560 2.186414 1.097728
## pct_cm_ pop_density
## 1.079553 1.258178
Dimension reduction
with principal components analysis
pca <- prcomp(
formula = ~.,
data = dfw_estimates,
scale. = TRUE,
center = TRUE
)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.7393 1.2937 1.0202 0.9148 0.78444 0.58939 0.5261
## Proportion of Variance 0.3781 0.2092 0.1301 0.1046 0.07692 0.04342 0.0346
## Cumulative Proportion 0.3781 0.5874 0.7175 0.8220 0.89897 0.94239 0.9770
## PC8
## Standard deviation 0.42903
## Proportion of Variance 0.02301
## Cumulative Proportion 1.00000
pca_tibble <- pca$rotation %>%
as_tibble(rownames = "predictor")
pca_tibble
## # A tibble: 8 × 9
## predictor PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 nbr_mng 0.468 0.344 7.98e-4 -0.106 8.97e-4 -0.231 0.0192 0.773
## 2 nbr_dc_p 0.252 0.586 -5.06e-2 -0.280 -3.62e-1 -0.290 0.0705 -0.540
## 3 nbr_dc_m 0.499 0.0175 -3.74e-2 -0.0967 -4.11e-2 0.692 -0.500 -0.104
## 4 nbr_dc_sc 0.488 -0.196 -6.42e-3 0.0450 2.07e-1 0.212 0.780 -0.158
## 5 nbr_dc_sp 0.428 -0.289 -2.65e-2 0.213 4.24e-1 -0.552 -0.368 -0.258
## 6 mn_cm_g 0.0145 -0.436 2.37e-1 -0.859 -2.17e-2 -0.119 -0.0223 0.0322
## 7 pct_cm_ 0.145 -0.120 8.37e-1 0.311 -4.07e-1 -0.0468 -0.0122 -0.00608
## 8 pop_densi… -0.162 0.459 4.89e-1 -0.140 6.92e-1 0.134 -0.00901 -0.0864
pca_tibble %>%
select(predictor:PC5) %>%
pivot_longer(PC1:PC5, names_to = "component", values_to = "value") %>%
ggplot(aes(x = value, y = predictor)) +
geom_col(fill = "darkgreen", color = "darkgreen", alpha = 0.5) +
facet_wrap(~component, nrow = 1) +
labs(y = NULL, x = "Value", title = " Loadings for first five principal components for Senegal Census 2002") +
theme_minimal()

components <- predict(pca, dfw_estimates)
dfw_pca <- MPI_data_dr_2013_for_model%>%
select(MPI_ndx) %>%
cbind(components)
ggplot(dfw_pca, aes(fill = PC1)) +
geom_sf(color = NA) +
labs(title = "Map of principal component 1 for Senegal Census 2002") +
theme_void() +
scale_fill_viridis_c()

pca_formula <- paste0("log(MPI_ndx) ~ ",
paste0('PC', 1:6, collapse = ' + '))
pca_model <- lm(formula = pca_formula, data = dfw_pca)
summary(pca_model)
##
## Call:
## lm(formula = pca_formula, data = dfw_pca)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.41657 -0.17138 0.02118 0.18770 1.31306
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.191601 0.014555 -150.578 < 2e-16 ***
## PC1 -0.110044 0.008378 -13.135 < 2e-16 ***
## PC2 0.158967 0.011264 14.113 < 2e-16 ***
## PC3 0.044954 0.014283 3.147 0.00176 **
## PC4 0.086249 0.015929 5.415 1.03e-07 ***
## PC5 -0.058104 0.018576 -3.128 0.00188 **
## PC6 -0.033497 0.024723 -1.355 0.17617
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3036 on 428 degrees of freedom
## Multiple R-squared: 0.4968, Adjusted R-squared: 0.4898
## F-statistic: 70.43 on 6 and 428 DF, p-value: < 2.2e-16
Spatial
regression
MPI_data_dr_2013_for_model$residuals <- residuals(model1)
ggplot(MPI_data_dr_2013_for_model, aes(x = residuals)) +
geom_histogram(bins = 100, alpha = 0.5, color = "navy",
fill = "navy") +
labs(title = "Distribution of model residuals for Senegal Census 2002") +
theme_minimal()

library(spdep)
wts <- MPI_data_dr_2013_for_model %>%
poly2nb() %>%
nb2listw()
moran.test(MPI_data_dr_2013_for_model$residuals, wts)
##
## Moran I test under randomisation
##
## data: MPI_data_dr_2013_for_model$residuals
## weights: wts
##
## Moran I statistic standard deviate = 5.0381, p-value = 2.351e-07
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.1417648099 -0.0023041475 0.0008177211
MPI_data_dr_2013_for_model$lagged_residuals <- lag.listw(wts, MPI_data_dr_2013_for_model$residuals)
ggplot(MPI_data_dr_2013_for_model, aes(x = residuals, y = lagged_residuals)) +
theme_minimal() +
labs(title = "Moran scatterplot of residual spatial autocorrelation for Senegal Census 2002") +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "red")

Geographically weighted
regression
Choosing a bandwidth
for GWR
library(GWmodel)
library(sf)
dfw_data_sp <- MPI_data_dr_2013_for_model%>%
as_Spatial()
bw <- bw.gwr(
formula = formula,
data = dfw_data_sp,
kernel = "bisquare",
adaptive = TRUE
)
## Adaptive bandwidth: 276 CV score: 30.72873
## Adaptive bandwidth: 179 CV score: 30.5402
## Adaptive bandwidth: 117 CV score: 31.56878
## Adaptive bandwidth: 215 CV score: 30.53929
## Adaptive bandwidth: 240 CV score: 30.59468
## Adaptive bandwidth: 202 CV score: 30.50807
## Adaptive bandwidth: 191 CV score: 30.50642
## Adaptive bandwidth: 187 CV score: 30.51038
## Adaptive bandwidth: 196 CV score: 30.50472
## Adaptive bandwidth: 196 CV score: 30.50472
gw_model <- gwr.basic(
formula = formula,
data = dfw_data_sp,
bw = bw,
kernel = "bisquare",
adaptive = TRUE
)
names(gw_model)
## [1] "GW.arguments" "GW.diagnostic" "lm" "SDF"
## [5] "timings" "this.call" "Ftests"
gw_model_results <- gw_model$SDF %>%
st_as_sf()
names(gw_model_results)
## [1] "Intercept" "nbr_mng" "nbr_dc_p" "nbr_dc_m"
## [5] "nbr_dc_sc" "nbr_dc_sp" "mn_cm_g" "pct_cm_"
## [9] "pop_density" "y" "yhat" "residual"
## [13] "CV_Score" "Stud_residual" "Intercept_SE" "nbr_mng_SE"
## [17] "nbr_dc_p_SE" "nbr_dc_m_SE" "nbr_dc_sc_SE" "nbr_dc_sp_SE"
## [21] "mn_cm_g_SE" "pct_cm__SE" "pop_density_SE" "Intercept_TV"
## [25] "nbr_mng_TV" "nbr_dc_p_TV" "nbr_dc_m_TV" "nbr_dc_sc_TV"
## [29] "nbr_dc_sp_TV" "mn_cm_g_TV" "pct_cm__TV" "pop_density_TV"
## [33] "Local_R2" "geometry"
ggplot(gw_model_results, aes(fill = Local_R2)) +
geom_sf(color = NA) +
scale_fill_viridis_c() +
labs(title = "Local R-squared values from the GWR model for Senegal Census 2002") +
theme_void()

ggplot(gw_model_results, aes(fill = nbr_mng)) +
geom_sf(color = NA) +
scale_fill_viridis_c() +
labs(title = "Local parameter estimates for household members for Senegal Census 2002") +
theme_void() +
labs(fill = "Local β for \nHH")

ggplot(gw_model_results, aes(fill = pop_density)) +
geom_sf(color = NA) +
scale_fill_viridis_c() +
labs(title = "Local parameter estimates for population density for Senegal Census 2002") +
theme_void() +
labs(fill = "Local β for \npopulation density")

Classification and
clustering of Senegal census data
Geodemographic
classification
set.seed(1983)
dfw_kmeans <- dfw_pca %>%
st_drop_geometry() %>%
select(PC1:PC8) %>%
kmeans(centers = 6)
table(dfw_kmeans$cluster)
##
## 1 2 3 4 5 6
## 20 83 101 89 36 106
dfw_clusters <- dfw_pca %>%
mutate(cluster = as.character(dfw_kmeans$cluster))
ggplot(dfw_clusters, aes(fill = cluster)) +
geom_sf(size = 0.1) +
scale_fill_brewer(palette = "Set1") +
labs(title = "Map of geodemographic clusters for Senegal Census 2002") +
theme_void() +
labs(fill = "Cluster ")

library(plotly)
cluster_plot <- ggplot(dfw_clusters,
aes(x = PC1, y = PC2, color = cluster)) +
geom_point() +
scale_color_brewer(palette = "Set1") +
labs(title = "Interactive scatterplot of PC1 and PC2 colored by cluster for Senegal Census 2002") +
theme_minimal()
ggplotly(cluster_plot) %>%
layout(legend = list(orientation = "h", y = -0.15,
x = 0.2, title = "Cluster"))
Spatial clustering
& regionalization
library(spdep)
input_vars <- dfw_pca %>%
select(PC1:PC8) %>%
st_drop_geometry() %>%
as.data.frame()
skater_nbrs <- poly2nb(dfw_pca, queen = TRUE)
costs <- nbcosts(skater_nbrs, input_vars)
skater_weights <- nb2listw(skater_nbrs, costs, style = "B")
mst <- mstree(skater_weights)
regions <- skater(
mst[,1:2],
input_vars,
ncuts = 7,
crit = 10
)
dfw_clusters$region <- as.character(regions$group)
ggplot(dfw_clusters, aes(fill = region)) +
geom_sf(size = 0.1) +
labs(title = "Map of contiguous regions derived with the SKATER algorithm for Senegal Census 2002") +
scale_fill_brewer(palette = "Set1") +
theme_void()
